[1]:
import os

# Show the working directory so the relative CSV paths below resolve predictably.
current_dir = os.getcwd()
current_dir
[1]:
'C:\\Users\\RAHUL SRIVASTAVA\\Desktop\\Learnbay\\ML\\Decision_Tree'
[46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
[47]:
# Load the 50 Startups data; `dataset` is reused by every cell below.
dataset = pd.read_csv('50_Startups.csv')
dataset.head()
[47]:
| R&D Spend | Administration | Marketing Spend | State | Profit | |
|---|---|---|---|---|---|
| 0 | 165349.20 | 136897.80 | 471784.10 | New York | 192261.83 |
| 1 | 162597.70 | 151377.59 | 443898.53 | California | 191792.06 |
| 2 | 153441.51 | 101145.55 | 407934.54 | Florida | 191050.39 |
| 3 | 144372.41 | 118671.85 | 383199.62 | New York | 182901.99 |
| 4 | 142107.34 | 91391.77 | 366168.42 | Florida | 166187.94 |
# Data Preprocessing
Data Preprocessing¶
[48]:
# 1. Handling missing values
# 2. Handling outliers - outlier treatment mainly matters when a large tree overfits the data
#    (overfitting: the model memorises the training data and generalises poorly to unseen data)
# 3. Encoding - for tree-based models, label encoding is generally preferred
# 4. Feature scaling - not needed for decision trees, because splits compare each value
#    against a threshold and are unaffected by monotonic rescaling
# 5. Imbalance treatment
[49]:
# Per-column missing-value count; the 50 Startups data is complete.
dataset.isna().sum()
[49]:
R&D Spend 0 Administration 0 Marketing Spend 0 State 0 Profit 0 dtype: int64
[50]:
# Dtypes and non-null counts: 50 rows, four float columns plus object-typed State.
dataset.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 R&D Spend 50 non-null float64 1 Administration 50 non-null float64 2 Marketing Spend 50 non-null float64 3 State 50 non-null object 4 Profit 50 non-null float64 dtypes: float64(4), object(1) memory usage: 2.1+ KB
[51]:
# Label-encode the categorical State column in one chained step
# (tree-based models work directly with the integer category codes).
dataset['State'] = dataset['State'].astype('category').cat.codes
[52]:
dataset.head()
[52]:
| R&D Spend | Administration | Marketing Spend | State | Profit | |
|---|---|---|---|---|---|
| 0 | 165349.20 | 136897.80 | 471784.10 | 2 | 192261.83 |
| 1 | 162597.70 | 151377.59 | 443898.53 | 0 | 191792.06 |
| 2 | 153441.51 | 101145.55 | 407934.54 | 1 | 191050.39 |
| 3 | 144372.41 | 118671.85 | 383199.62 | 2 | 182901.99 |
| 4 | 142107.34 | 91391.77 | 366168.42 | 1 | 166187.94 |
# Split the data into dependent and independent variable
Split the data into dependent and independent variable¶
[53]:
# Features: every column except the last (Profit); target kept as a one-column frame.
x = dataset.iloc[:, :-1]
y = dataset.loc[:, ['Profit']]
[54]:
# Sanity-check dimensions: 50 rows, 4 feature columns, 1 target column.
(x.shape, y.shape)
[54]:
((50, 4), (50, 1))
# splitting the data into training and test
splitting the data into training and test¶
[63]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1
)
# Building Decision Tree Regressor Model
[64]:
from sklearn.tree import DecisionTreeRegressor

# Fit an unpruned regression tree. Fixing random_state makes feature
# tie-breaking deterministic, so re-running the notebook reproduces the
# same tree (and the same scores below).
dtRegressor = DecisionTreeRegressor(random_state=1)
dtRegressor.fit(x_train, y_train)
[64]:
DecisionTreeRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()
[65]:
# A decision tree is a non-parametric model: unlike linear regression, it makes
# no distributional assumptions about the data.
#
# Predict the test data with the decision-tree regressor model
[66]:
# Predict both splits so train vs. test performance can be compared below.
y_pred_train = dtRegressor.predict(x_train)
y_pred_test = dtRegressor.predict(x_test)
# Evaluate your Model
[67]:
from sklearn.metrics import r2_score
[68]:
# r2_score reports the coefficient of determination (R^2), not classification
# accuracy -- label it accordingly. A train R^2 of 1.0 alongside a much lower
# test R^2 is the signature of an overfit, unpruned tree.
print('Training R2 Score : ',r2_score(y_train,y_pred_train))
print()
print('Test R2 Score : ',r2_score(y_test,y_pred_test))
Training Accuracy : 1.0 Test Accuracy : 0.8978427744373091
# Training R² is 100%, which indicates overfitting; some variance is acceptable, but a perfect training score should be validated with K-Fold cross-validation
Training R² is 100%, which indicates overfitting; some variance is acceptable, but a perfect training score should be validated with K-Fold cross-validation¶
# Using Cross Validation Method - K fold Method
[69]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation on the training set; the default scorer for a
# regressor is R^2. NOTE(review): one fold scores about -21, so the tree is
# highly unstable on this small (50-row) dataset.
Training_Accuracy = cross_val_score(dtRegressor,x_train,y_train,cv=10)
print(Training_Accuracy)
[ 0.75859449 0.8686139 0.54063583 0.87707088 0.96925619 0.26165478 0.92993926 0.27110755 -21.22435468 0.92487939]
[70]:
# Mean CV score -- dragged far below zero by the single catastrophic fold.
print(Training_Accuracy.mean())
-1.482260240613908
# linear Regression Model
[75]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least-squares regression on the same train/test split.
linear = LinearRegression()
linear.fit(x_train,y_train)
[75]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
[76]:
# Predict the train and test sets with the linear model.
y_pred_train_lr = linear.predict(x_train)
y_pred_test_lr = linear.predict(x_test)
[77]:
# r2_score is R^2 (coefficient of determination), not classification accuracy,
# so label it as such. Train and test R^2 are close here, i.e. the linear
# model generalises better than the single tree on this data.
print('Training R2 Score : ',r2_score(y_train,y_pred_train_lr))
print()
print('Test R2 Score : ',r2_score(y_test,y_pred_test_lr))
Training Accuracy : 0.9419507593691141 Test Accuracy : 0.9616053937220067
# Random Forest Regressor Model
[83]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 500 trees (the old comment said 100, contradicting the code);
# averaging many bootstrapped trees reduces the variance seen with the single
# decision tree. Fixing random_state makes the bootstrap sampling reproducible.
rf_regressor = RandomForestRegressor(n_estimators=500, random_state=1)
rf_regressor.fit(x_train, y_train)
[83]:
RandomForestRegressor(n_estimators=500)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(n_estimators=500)
[84]:
# Predict the train and test sets with the random-forest model
# (the original comment said "Linear model" -- a copy-paste slip).
y_pred_train_rf = rf_regressor.predict(x_train)
y_pred_test_rf = rf_regressor.predict(x_test)
[85]:
# r2_score is R^2, not classification accuracy -- label it as such. The
# train/test gap is much smaller than the single tree's, showing the
# variance-reduction effect of the ensemble.
print('Training R2 Score : ',r2_score(y_train,y_pred_train_rf))
print()
print('Test R2 Score : ',r2_score(y_test,y_pred_test_rf))
Training Accuracy : 0.9855904843449871 Test Accuracy : 0.9438374607010358
# Classification Problem
Classification Problem¶
[88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
[89]:
# Palmer Station LTER penguin data; missing entries appear both as NaN and as '.'.
df = pd.read_csv('penguins_lter.csv')
[90]:
# First rows -- note the '.' placeholder in Body Mass for row 0.
df.head()
[90]:
| studyName | Sample Number | Species | Region | Island | Stage | Individual ID | Clutch Completion | Date Egg | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | Comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PAL0708 | 1 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A1 | Yes | 11/11/2007 | 39.1 | 18.7 | 181 | . | MALE | NaN | NaN | Not enough blood for isotopes. |
| 1 | PAL0708 | 2 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A2 | Yes | 11/11/2007 | 39.5 | 17.4 | 186 | 3800 | FEMALE | 8.94956 | -24.69454 | NaN |
| 2 | PAL0708 | 3 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N2A1 | Yes | 11/16/2007 | 40.3 | 18.0 | . | 3250 | FEMALE | 8.36821 | -25.33302 | NaN |
| 3 | PAL0708 | 4 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N2A2 | Yes | 11/16/2007 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Adult not sampled. |
| 4 | PAL0708 | 5 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N3A1 | Yes | 11/16/2007 | 36.7 | 19.3 | 193 | 3450 | FEMALE | 8.76651 | -25.32426 | NaN |
[91]:
# 344 rows, 17 columns before any cleaning.
df.shape
[91]:
(344, 17)
[92]:
# Flipper Length and Body Mass read as object dtype because of the '.' placeholder.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 344 entries, 0 to 343 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 studyName 344 non-null object 1 Sample Number 344 non-null int64 2 Species 344 non-null object 3 Region 344 non-null object 4 Island 344 non-null object 5 Stage 344 non-null object 6 Individual ID 344 non-null object 7 Clutch Completion 344 non-null object 8 Date Egg 344 non-null object 9 Culmen Length (mm) 342 non-null float64 10 Culmen Depth (mm) 342 non-null float64 11 Flipper Length (mm) 342 non-null object 12 Body Mass (g) 342 non-null object 13 Sex 334 non-null object 14 Delta 15 N (o/oo) 330 non-null float64 15 Delta 13 C (o/oo) 331 non-null float64 16 Comments 26 non-null object dtypes: float64(4), int64(1), object(12) memory usage: 45.8+ KB
[93]:
# Full list of column names (several contain spaces and unit suffixes).
df.columns
[93]:
Index(['studyName', 'Sample Number', 'Species', 'Region', 'Island', 'Stage',
'Individual ID', 'Clutch Completion', 'Date Egg', 'Culmen Length (mm)',
'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)', 'Sex',
'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)', 'Comments'],
dtype='object')[94]:
# Quick two-row look before deciding which columns to drop.
df.head(2)
[94]:
| studyName | Sample Number | Species | Region | Island | Stage | Individual ID | Clutch Completion | Date Egg | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | Comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PAL0708 | 1 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A1 | Yes | 11/11/2007 | 39.1 | 18.7 | 181 | . | MALE | NaN | NaN | Not enough blood for isotopes. |
| 1 | PAL0708 | 2 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | N1A2 | Yes | 11/11/2007 | 39.5 | 17.4 | 186 | 3800 | FEMALE | 8.94956 | -24.69454 | NaN |
[95]:
# Drop identifier and free-text columns that carry no predictive signal;
# removing them up-front also keeps the eventual tree smaller.
df = df.drop(columns=['studyName', 'Sample Number', 'Individual ID', 'Date Egg', 'Comments'])
[96]:
# Re-inspect after the drop: 12 columns remain.
df.head(2)
[96]:
| Species | Region | Island | Stage | Clutch Completion | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | Yes | 39.1 | 18.7 | 181 | . | MALE | NaN | NaN |
| 1 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | Yes | 39.5 | 17.4 | 186 | 3800 | FEMALE | 8.94956 | -24.69454 |
[97]:
# Shape after dropping the five columns: (344, 12).
df.shape
[97]:
(344, 12)
# Data Preprocessing
Data Preprocessing¶
[98]:
# Missing-value count per column (NaN only; the '.' placeholder is not counted yet).
df.isna().sum()
[98]:
Species 0 Region 0 Island 0 Stage 0 Clutch Completion 0 Culmen Length (mm) 2 Culmen Depth (mm) 2 Flipper Length (mm) 2 Body Mass (g) 2 Sex 10 Delta 15 N (o/oo) 14 Delta 13 C (o/oo) 13 dtype: int64
[99]:
# Missing values as a percentage of rows; every column is under 5%.
df.isna().mean()*100
[99]:
Species 0.000000 Region 0.000000 Island 0.000000 Stage 0.000000 Clutch Completion 0.000000 Culmen Length (mm) 0.581395 Culmen Depth (mm) 0.581395 Flipper Length (mm) 0.581395 Body Mass (g) 0.581395 Sex 2.906977 Delta 15 N (o/oo) 4.069767 Delta 13 C (o/oo) 3.779070 dtype: float64
[100]:
# All columns have under 5% missing values, so we impute rather than drop any variable.
# Even at 20-25% missing, take confirmation from the business stakeholders before dropping
# a variable: ask whether the data can be sourced; otherwise dropping it can introduce bias.
[101]:
# Inspect the raw value set of every column; this exposes the '.' placeholder
# used for missing entries in several columns.
for col in df.columns:
    print("*******************************", col, "*******************************")
    print()
    print(set(df[col].to_list()))
******************************* Species *******************************
{'Gentoo penguin (Pygoscelis papua)', 'Chinstrap penguin (Pygoscelis antarctica)', 'Adelie Penguin (Pygoscelis adeliae)'}
******************************* Region *******************************
{'Anvers'}
******************************* Island *******************************
{'Torgersen', 'Biscoe', 'Dream'}
******************************* Stage *******************************
{'Adult, 1 Egg Stage'}
******************************* Clutch Completion *******************************
{'No', 'Yes'}
******************************* Culmen Length (mm) *******************************
{49.4, 49.9, 37.3, 45.7, 50.4, 32.1, 33.1, 33.5, 35.6, 36.2, 35.7, 38.1, 35.2, 40.2, 41.5, 34.1, 34.6, 36.7, 37.8, 38.9, 39.5, 40.3, 42.0, 41.1, 42.5, 46.0, 44.1, 45.8, 46.5, 50.0, 51.3, 52.7, 51.7, 52.0, 58.0, 54.2, 55.8, 59.6, 47.7, 47.2, 48.2, 45.0, 50.2, nan, 43.8, 34.4, 34.5, 34.0, 35.9, 35.3, 35.0, 35.1, 35.5, 36.6, 36.4, 36.5, 36.0, 36.9, 37.7, 37.9, 37.2, 37.6, 37.0, 38.6, 38.7, 38.2, 38.8, 37.5, 39.1, 39.3, 39.2, 39.8, 39.6, 39.0, 39.7, 40.6, 40.5, 40.9, 40.8, 40.1, 41.4, 41.3, 41.6, 41.8, 41.0, 42.2, 42.3, 42.8, 42.1, 42.9, 43.2, 42.7, 43.1, 43.5, 43.3, 44.5, 44.4, 44.0, 45.6, 45.4, 45.2, 45.9, 45.5, 46.1, 46.6, 46.4, 46.7, 46.9, 47.0, 47.5, 47.6, 46.2, 46.8, 48.5, 48.1, 48.7, 48.4, 47.8, 49.2, 49.5, 49.7, 49.0, 49.8, 50.5, 50.3, 50.6, 50.9, 50.8, 51.0, 50.1, 51.5, 51.4, 50.7, 52.8, 52.2, 51.9, 47.3, 51.1, 53.5, 52.5, 52.1, 43.6, 53.4, 54.3, 55.9, 55.1, 48.8, 42.6, 49.3, 45.1, 45.3, 42.4, 43.4, 44.9, 48.6, 40.7, 49.6, 49.1, 41.7, 38.5, 46.3, 47.4, 38.3, 36.3, 36.8, nan}
******************************* Culmen Depth (mm) *******************************
{13.2, 14.1, 15.9, 16.7, 17.4, 18.4, 19.0, 20.7, 21.5, 18.7, 19.3, 18.0, 20.6, 17.8, 19.6, 20.2, 21.2, 21.1, 13.5, 14.5, 14.0, 15.5, 15.0, 16.0, 16.4, 16.9, 16.6, 16.1, 17.5, 17.1, 17.6, 17.9, 17.0, 18.1, 18.9, 18.6, 18.5, 19.1, 19.5, 19.4, 19.9, 20.0, 20.5, 20.1, 13.4, 13.9, 14.4, 14.9, 15.4, nan, 13.3, 13.8, 13.7, 14.3, 14.8, 14.2, 14.7, 15.2, 15.3, 15.7, 15.8, 16.2, 16.8, 16.3, 17.7, 17.3, 17.2, 18.3, 18.8, 18.2, 19.8, 19.2, 19.7, 20.3, 20.8, nan, 16.5, 13.6, 13.1, 14.6, 15.1, 15.6}
******************************* Flipper Length (mm) *******************************
{'172', '219', '.', '229', '181', '180', '205', '202', '228', '193', '231', '179', '174', '176', '220', '215', '184', '208', '191', '211', '201', '187', '206', '221', '225', '223', '209', '230', '194', '178', '200', '198', '189', '185', '213', '224', '182', '210', '222', '190', '226', '186', '197', '207', nan, '183', '188', '192', '199', '212', '218', '203', '195', '214', '217', '196', '216'}
******************************* Body Mass (g) *******************************
{'5000', '.', '3500', '3100', '3675', '5750', '4575', '4150', '5850', '4625', '3350', '4275', '2975', '3625', '2925', '3300', '5650', '4650', '3050', '3600', '4050', '5050', '3775', nan, '5800', '5300', '6000', '4850', '4775', '3850', '2900', '5550', '3325', '4600', '3700', '4000', '3450', '3875', '5450', '5150', '4900', '5500', '4975', '4500', '4375', '3275', '3200', '4100', '4400', '4750', '6300', '3150', '5700', '2850', '4725', '5600', '3075', '3000', '5200', '3175', '3475', '4075', '5950', '3725', '3975', '5250', '4450', '6050', '3825', '3800', '3900', '4950', '5400', '3400', '3650', '3750', '4700', '4675', '3550', '4300', '5350', '3525', '4875', '3425', '4250', '2700', '4925', '4475', '4350', '5100', '4200', '3250', '3575', '4550', '3950', '4800'}
******************************* Sex *******************************
{'.', 'FEMALE', 'MALE', nan}
******************************* Delta 15 N (o/oo) *******************************
{7.88863, 8.93465, 9.98044, 8.46894, 8.90027, 8.47781, 8.19539, 8.94956, 8.36821, 8.76651, 8.66496, 9.18718, 10.02019, 7.88494, 8.94332, 8.8964, 8.63243, 8.35396, 8.01979, 9.4606, 9.27158, 9.07826, 8.79665, 8.9846, 9.46929, 9.62357, 9.65061, 8.70642, 8.2066, 8.15426, 8.29671, 8.30515, 8.14756, 8.38404, 8.33524, 8.32718, 9.42666, 9.49283, 9.45827, 9.29808, 9.22537, 8.39299, 7.90436, 8.65914, 8.30231, 7.99184, 8.75984, 8.65015, 8.51951, 8.08138, 8.74802, 9.25769, 9.22286, 9.21292, 8.87988, 9.14863, 10.02372, 9.11066, 9.07825, 8.63551, nan, 8.27141, nan, 8.96472, 8.96436, 8.24651, 8.65803, 8.63259, nan, 8.58063, 8.27376, nan, 8.01485, 8.2345, 8.23468, 8.86495, 8.83352, 8.27102, 7.76843, 7.69778, 7.993, 8.24694, 8.24515, 8.88098, 8.64701, 8.58487, 8.14567, 8.9399, 9.07878, 8.37615, 8.27595, 8.29226, 8.12311, 8.41017, 8.74647, 8.93997, nan, 7.8962, 8.97025, 8.62623, 8.86853, 8.73762, 8.18658, 9.25177, 9.46819, 8.40327, 8.60447, 9.02657, 9.04296, 9.415, 8.45738, 7.79958, 8.24691, 8.59147, 8.30166, 7.8208, 7.96935, 8.81668, 8.68747, 8.47257, 8.49854, 8.47938, 9.56534, 9.00642, 9.32277, 9.59462, 9.05674, 9.36799, 9.06829, nan, 8.04111, 7.90971, 7.96621, 7.89744, 8.48367, 8.03659, 8.5964, 8.04787, 8.79787, 8.79581, 9.22033, 8.67538, 9.15308, 9.05736, 9.43684, 9.32169, 8.36701, 8.62264, 8.25818, 8.36936, 8.85664, 8.41151, 7.63884, 8.13643, 8.19579, 8.24253, 8.19749, 8.52566, 8.47829, 9.43146, 9.08458, 9.36493, 9.39305, 9.43782, 9.18021, 8.58319, 8.3639, 8.85699, 8.39459, 8.94365, 7.92358, 7.97408, 8.56192, 8.77322, 8.08354, nan, 7.9953, 7.84057, 8.55583, 8.43423, 8.57087, 8.97533, 8.83502, 9.41131, 9.01079, 9.23196, 9.32105, 8.1631, 9.2881, 9.59245, 9.26715, 9.70465, 8.15566, 8.16582, 8.20106, nan, 8.33825, nan, 8.48204, 8.717, 8.90002, 8.43951, 8.47067, 9.93727, 9.68933, 9.51929, 9.29078, 9.03935, 9.61734, 8.4207, nan, 8.12691, nan, 8.24246, 7.77672, 7.6322, 7.6887, 8.90723, 8.98705, 8.35802, 8.41837, 8.80967, 8.10417, 9.19031, 9.37369, 9.49645, 9.63954, 9.69756, 9.4618, 
9.74144, 8.19101, 8.95063, 8.39867, 8.56708, 8.57199, 8.26548, 7.83733, 7.63452, 7.82381, 7.8881, 8.30817, 8.2554, 8.6287, 8.68744, 8.20042, 8.55868, 9.53262, 9.6895, 9.46985, 8.63604, 8.47827, 9.72764, 9.66523, 8.84451, 9.8059, 9.77528, 8.48095, 8.3118, 8.71078, 8.10231, 8.46531, nan, 8.77018, 8.32359, 8.14705, 8.47173, 8.65466, 8.92069, 8.45167, 9.13362, 9.35416, 9.74492, 9.11616, 9.18985, 9.51784, 9.36392, 9.2151, 9.75486, 8.2993, 8.51362, 7.68528, nan, nan, 8.27428, 8.61651, 8.14776, 8.80186, 8.28601, 9.18528, 9.10702, 9.88809, 9.80589, 9.50772, 9.49106, 9.31735, 9.0233, 9.35138, 8.64931, 8.78557, 8.46616, 8.21634, 8.35078, 8.10277, 7.96491, 8.3554, 8.63701, 8.13746, 8.22673, 8.38289, 9.11006, 9.28153, 9.79532, 9.12277, 8.66271, 9.17847, 9.108, 10.02544, 9.63074, 9.14382, 8.91434, 8.60092, 8.03624, 8.72037, 8.50153, 8.88942, 8.53018, 8.11238, 9.37608, 9.34089, 9.30722, 8.07137, 8.95998, 9.36668, 9.2372, 9.04218, 9.02642, 8.63488, 8.49662, 8.38324, 9.23408, 8.48789, 8.85562, 8.49915, 8.56674}
******************************* Delta 13 C (o/oo) *******************************
{-26.06594, -25.03469, -26.33867, -26.18466, -26.18444, -26.20372, -26.79093, -26.57585, -26.18599, -26.2253, -25.95399, -25.62618, -25.89677, -25.8306, -25.33302, -24.65859, -25.69199, -24.45721, -25.39587, -26.83006, -24.80526, -26.06943, -26.75621, -24.69638, -24.43062, -25.38157, -26.57563, -25.32829, -25.27385, -25.54456, -26.7489, -26.05621, -25.48025, -25.79203, -26.08165, -25.77951, -26.65931, -26.07021, -26.06967, -26.22664, nan, nan, -26.08547, -26.68867, -25.22664, -26.86127, -26.20538, -26.03679, -26.78733, -26.3433, -25.29805, nan, -26.01363, -26.32909, -26.59467, -26.65359, -26.22227, -25.23319, -25.5139, -26.79846, -25.73722, -26.7699, -24.61867, -24.805, -24.29229, -24.60882, -24.36088, -24.59467, -24.31198, -24.40753, -24.62717, -24.70615, nan, -26.11657, -25.85203, -26.41218, -25.77264, -25.95541, -26.23886, -25.80208, nan, nan, -25.88547, -26.05756, nan, -26.35425, -26.15531, -26.11244, nan, -26.76821, -26.03495, -26.06209, -26.63085, -26.38085, -25.81513, -25.3933, -25.4276, -25.81012, -25.49448, -24.90816, -24.52896, -25.19837, -24.52698, -25.11223, -24.59996, -24.6844, -24.55644, nan, -24.45195, -26.21019, nan, -26.84374, -26.13971, -26.27853, -26.70968, -25.32176, -26.42406, -26.57941, -25.54976, -25.6821, -24.4228, -26.44787, -25.14591, -26.1165, -26.38396, -25.97696, -26.19444, -25.52473, -26.84506, -25.01185, -26.01549, -26.84272, -26.5929, -26.84415, -26.37809, -26.30037, -25.21799, -25.57956, -26.30019, -25.37899, -25.37746, -24.26375, -24.59066, -24.6879, -24.34684, -24.25255, -24.89958, -26.24369, -25.06691, -26.63405, -26.13832, -25.0602, -25.46172, -25.39369, -26.09635, -25.36288, -24.53494, -26.2766, -25.79529, -26.17213, -26.60436, -24.66867, -24.48153, -25.89741, -24.78984, -26.06203, -25.89834, -24.10255, -26.18161, -26.22848, -26.09989, -26.61601, -26.55602, -26.46254, -26.0245, -25.22588, -26.79053, -25.50811, -26.55351, -26.23027, -24.69454, -24.7557, -24.49433, -24.41562, -25.04169, -24.48403, -24.64162, -24.54903, -26.60023, 
-26.18763, -25.40075, nan, -26.07081, nan, -26.39677, -26.69166, -25.23453, -25.46782, -25.46327, -25.42621, -25.07683, -26.89644, nan, -26.11046, -25.11609, -26.15775, -26.12417, -26.11199, -26.86352, -26.67799, -25.39181, -25.45171, -26.61414, -25.83352, -25.38017, -24.57994, -24.56481, -24.73735, -25.50562, -25.42826, -24.66259, -26.51382, -24.23592, -23.78767, -24.84059, -23.90309, -25.49523, -25.86482, -26.24837, -25.48383, -25.76147, -26.02002, -25.88156, -26.86485, -25.46569, -26.04726, -26.72751, -25.21315, -26.28055, -25.13993, -25.03474, -25.29856, -25.23061, -26.60484, -26.21569, -26.32601, -26.66958, -26.54718, -25.79189, -25.4168, -26.45978, -25.0102, -25.18543, -24.404, -25.01745, -25.61039, -25.00169, -24.64335, -24.97134, -24.47142, -24.66188, -24.31912, -24.86594, -26.07821, -26.01152, -25.53768, -25.69195, -25.09383, -26.16524, -26.3146, -26.40943, -26.84166, -26.74809, -26.61075, -26.44815, -25.12255, -26.38092, -25.96013, -26.9547, -26.27573, -24.77227, -26.42018, -25.09368, -26.07188, -25.52627, -26.04117, -26.21651, -26.74249, -25.19017, -26.69543, -26.50086, -26.11969, -25.32426, -25.69327, -26.38986, -25.0639, -26.78958, -26.68311, -26.7765, -26.1396, -26.09294, -25.1455, -25.60826, -25.57647, -25.88798, -25.79549, -24.7294, -24.35575, -24.65786, -24.38933, -25.03492, -24.30229, -24.17282, -24.68741, -24.36202, -24.3613, -26.15003, -26.49288, -26.61788, -26.71199, -26.79358, -24.16566, -24.59897, -26.23613, -25.98843, -24.38751, -26.12989, -27.01854, -26.5387, -26.36678, -24.54704, -24.59513, -26.72791, -26.42563, -25.70711, -26.03442, -25.3947, -26.8154, -24.45189, -24.90024, -23.89017, -26.48973, -26.36863, -26.70783}
[102]:
# Mode of the still-object-typed column is '190'; used to fill the '.' placeholder below.
df['Flipper Length (mm)'].mode()
[102]:
0 190 Name: Flipper Length (mm), dtype: object
[103]:
# Replace the '.' placeholder with the column mode ('190' in this data).
# Computing the mode instead of hard-coding it keeps the cell correct if the
# underlying data ever changes.
flipper_mode = df['Flipper Length (mm)'].mode()[0]
df['Flipper Length (mm)'] = np.where(df['Flipper Length (mm)']=='.',
                                     flipper_mode,
                                     df['Flipper Length (mm)'])
[104]:
# Mode of the object-typed Body Mass column is '3800'; used to fill '.' below.
df['Body Mass (g)'].mode()
[104]:
0 3800 Name: Body Mass (g), dtype: object
[105]:
# Replace the '.' placeholder with the column mode ('3800' in this data),
# computed rather than hard-coded so the cell survives data changes.
body_mass_mode = df['Body Mass (g)'].mode()[0]
df['Body Mass (g)'] = np.where(df['Body Mass (g)']=='.',
                               body_mass_mode,
                               df['Body Mass (g)'])
[106]:
# Mode of Sex is 'MALE'; used to replace the stray '.' entry below.
df['Sex'].mode()
[106]:
0 MALE Name: Sex, dtype: object
[107]:
# Replace the stray '.' entry with the column mode ('MALE' in this data),
# computed rather than hard-coded so the cell survives data changes.
sex_mode = df['Sex'].mode()[0]
df['Sex'] = np.where(df['Sex']=='.',
                     sex_mode,
                     df['Sex'])
[108]:
# Re-check every column's value set: the '.' placeholders are gone, but NaN
# values remain in the numeric columns and Sex.
for column in df.columns:
    print("*******************************", column, "*******************************")
    print()
    print(set(df[column].to_list()))
******************************* Species *******************************
{'Gentoo penguin (Pygoscelis papua)', 'Chinstrap penguin (Pygoscelis antarctica)', 'Adelie Penguin (Pygoscelis adeliae)'}
******************************* Region *******************************
{'Anvers'}
******************************* Island *******************************
{'Torgersen', 'Biscoe', 'Dream'}
******************************* Stage *******************************
{'Adult, 1 Egg Stage'}
******************************* Clutch Completion *******************************
{'No', 'Yes'}
******************************* Culmen Length (mm) *******************************
{49.4, 49.9, 37.3, 45.7, 50.4, 32.1, 33.1, 33.5, 35.6, 36.2, 35.7, 38.1, 35.2, 40.2, 41.5, 34.1, 34.6, 36.7, 37.8, 38.9, 39.5, 40.3, 42.0, 41.1, 42.5, 46.0, 44.1, 45.8, nan, 50.0, 51.3, 52.7, 51.7, 52.0, 58.0, 54.2, 55.8, 59.6, 47.7, 47.2, 48.2, nan, 45.0, 50.2, 43.8, 34.4, 34.5, 34.0, 35.9, 35.3, 35.1, 35.0, 35.5, 36.6, 36.4, 36.5, 36.0, 36.9, 37.7, 37.9, 37.2, 37.6, 37.0, 38.6, 38.7, 38.2, 38.8, 37.5, 39.1, 39.3, 39.2, 39.8, 39.6, 39.0, 39.7, 40.6, 40.5, 40.9, 40.8, 40.1, 41.4, 41.3, 41.6, 41.8, 41.0, 42.2, 42.3, 42.8, 42.1, 42.9, 43.2, 42.7, 43.1, 43.5, 43.3, 44.5, 44.4, 44.0, 45.6, 45.4, 45.2, 45.9, 45.5, 46.5, 46.1, 46.6, 46.4, 46.7, 47.0, 47.5, 47.6, 46.9, 46.2, 48.5, 48.1, 48.7, 48.4, 47.8, 49.2, 49.5, 49.7, 49.0, 49.8, 50.5, 50.3, 50.6, 50.9, 50.8, 51.0, 50.1, 51.5, 51.4, 50.7, 52.8, 52.2, 51.9, 47.3, 51.1, 53.5, 52.5, 52.1, 43.6, 53.4, 54.3, 55.9, 55.1, 48.8, 42.6, 49.3, 45.1, 45.3, 42.4, 43.4, 44.9, 48.6, 40.7, 49.6, 49.1, 41.7, 38.5, 46.8, 46.3, 47.4, 38.3, 36.3, 36.8}
******************************* Culmen Depth (mm) *******************************
{13.2, 14.1, 15.9, 16.7, 17.4, 18.4, 19.0, 20.7, 21.1, 21.5, 18.7, 19.3, 18.0, 20.6, 19.6, 20.2, 21.2, 13.5, 14.5, 14.0, 15.5, 15.0, 16.0, 16.5, 16.4, 16.9, 16.6, 17.0, 17.5, 16.1, 17.1, 17.6, 18.1, 18.9, 18.6, 17.9, 18.5, 19.1, 19.5, 19.4, 19.9, 20.0, 20.5, 20.1, nan, 13.4, 13.9, 14.4, 14.9, 15.4, nan, 13.3, 13.8, 13.7, 14.3, 14.8, 14.2, 14.7, 15.2, 15.3, 15.7, 15.8, 16.2, 16.8, 16.3, 17.7, 17.8, 17.3, 17.2, 18.3, 18.8, 18.2, 19.8, 19.2, 19.7, 20.3, 20.8, 13.6, 13.1, 14.6, 15.1, 15.6}
******************************* Flipper Length (mm) *******************************
{'172', '219', '229', '181', '180', '205', '202', '228', '193', '231', '179', '174', '176', '220', '215', '184', '208', '191', '211', '201', '187', '206', '221', '225', '223', '209', '230', '194', '178', '200', '198', '189', '185', '213', '224', '182', '210', '222', '190', '226', '186', '197', '207', nan, '183', '188', '192', '199', '212', '218', '203', '195', '214', '217', '196', '216'}
******************************* Body Mass (g) *******************************
{'5000', '3500', '3100', '3675', '5750', '4575', '4150', '5850', '4625', '3350', '4275', '2975', '3625', '2925', '3300', '5650', '4650', '3050', '3600', '4050', '5050', '3775', nan, '5800', '5300', '6000', '4850', '4775', '3850', '2900', '5550', '3325', '4600', '3700', '4000', '3450', '3875', '5450', '5150', '4900', '5500', '4975', '4500', '4375', '3275', '3200', '4100', '4400', '4750', '6300', '3150', '5700', '2850', '4725', '5600', '3075', '3000', '5200', '3175', '3475', '4075', '5950', '3725', '3975', '5250', '4450', '6050', '3825', '3800', '3900', '4950', '5400', '3400', '3650', '3750', '4700', '4675', '3550', '4300', '5350', '3525', '4875', '3425', '4250', '2700', '4925', '4475', '4350', '5100', '4200', '3250', '3575', '4550', '3950', '4800'}
******************************* Sex *******************************
{'FEMALE', 'MALE', nan}
******************************* Delta 15 N (o/oo) *******************************
{7.88863, 8.93465, 9.98044, 8.46894, 8.90027, 8.47781, 8.19539, 8.67538, 8.94956, 8.36821, 8.76651, 9.18718, 10.02019, 7.88494, 8.94332, 8.8964, 8.63243, 8.35396, 8.01979, 9.4606, 9.27158, 9.07826, 8.79665, 8.9846, 9.46929, 9.62357, 9.65061, 8.70642, 8.2066, 8.15426, 8.29671, 8.30515, 8.14756, 8.38404, 8.33524, 8.32718, 9.42666, 9.22537, 9.45827, 9.29808, 8.39299, 9.49283, 7.90436, 8.65914, 8.30231, 7.99184, 8.75984, 8.65015, 8.51951, 8.08138, 8.74802, 9.14863, 9.25769, 9.22286, 9.21292, 8.87988, 10.02372, 9.11066, nan, 8.63551, 9.07825, 8.27141, 8.96472, 8.96436, 8.24651, 8.65803, 8.63259, 8.58063, nan, 8.27376, nan, 8.01485, 8.2345, 8.23468, 8.86495, 8.83352, 8.27102, 7.76843, 7.69778, 7.993, 8.24694, 8.24515, 8.88098, 8.64701, 8.58487, 8.14567, 8.9399, 9.07878, 8.37615, 8.27595, 8.29226, 8.12311, 8.41017, nan, 8.74647, 8.93997, nan, nan, 7.8962, 8.97025, 8.62623, 8.86853, 8.73762, 8.18658, 9.25177, 9.46819, 8.40327, 8.60447, 9.02657, 9.04296, 9.415, 8.45738, nan, 7.79958, 8.24691, 8.59147, 8.30166, nan, 7.8208, 7.96935, 8.81668, 8.68747, 8.47257, 8.49854, 8.47938, 9.56534, 9.00642, 9.32277, 9.59462, 9.05674, 9.36799, 9.06829, 8.04111, 7.90971, 7.96621, 7.89744, 8.03659, 8.5964, 8.79787, 8.79581, 8.04787, 9.22033, 9.15308, 9.05736, 9.43684, 8.48367, 9.32169, 8.36701, 8.62264, 8.25818, 8.45167, 8.36936, 8.85664, 8.41151, 7.63884, 8.13643, 8.19579, 8.24253, 8.19749, 8.52566, 8.47829, 9.43146, 9.08458, 9.36493, 9.39305, 9.43782, 9.18021, 8.58319, 8.3639, 8.85699, 8.39459, 8.94365, 7.92358, 7.97408, 8.56192, 8.77322, 8.08354, 7.9953, 7.84057, 8.55583, 8.43423, 8.57087, 8.97533, 8.83502, 9.41131, 9.01079, 9.23196, 9.32105, 8.1631, 9.2881, 9.59245, 9.26715, 9.70465, 8.15566, 8.16582, 8.20106, 8.33825, 8.48204, 8.717, 8.90002, 8.43951, 8.47067, 9.93727, 9.68933, 9.51929, 9.29078, 9.03935, 9.61734, 8.4207, 8.12691, nan, 8.24246, 7.77672, 7.6322, 7.6887, 8.90723, 8.98705, 8.35802, 8.41837, 8.80967, 8.10417, 9.19031, 9.37369, 9.49645, 9.63954, 9.69756, 9.4618, 9.74144, 
8.19101, 8.95063, nan, 8.39867, 8.56708, 8.57199, 8.26548, 7.83733, 7.63452, 7.82381, 7.8881, 8.30817, 8.2554, 8.6287, 8.68744, 8.20042, 8.55868, 9.53262, 9.6895, 9.46985, 8.63604, 8.47827, 9.72764, 9.66523, 8.84451, 9.8059, 9.77528, 8.48095, 8.3118, 8.71078, 8.10231, 8.46531, 8.77018, nan, nan, 8.32359, 8.66496, 8.14705, 8.47173, 8.65466, 8.92069, 9.13362, 9.35416, 9.74492, 9.18985, 9.51784, 9.11616, 9.36392, 9.2151, 9.75486, 8.2993, 8.51362, nan, 7.68528, nan, 8.27428, 8.61651, 8.14776, 8.80186, 8.28601, 9.10702, 9.88809, 9.80589, 9.18528, 9.50772, 9.49106, 9.31735, 9.0233, 9.35138, 8.64931, 8.78557, 8.46616, 8.21634, 8.35078, 8.10277, 7.96491, 8.3554, 8.63701, 8.13746, 8.22673, 8.38289, 9.11006, 9.28153, 9.79532, 9.12277, 8.66271, 9.17847, 9.108, 10.02544, 9.63074, 9.14382, 8.91434, 8.60092, 8.03624, 8.72037, 8.50153, 8.88942, 8.53018, 8.11238, 9.37608, 9.34089, 9.30722, 8.07137, 8.95998, 9.36668, 9.2372, 9.04218, 9.02642, 8.63488, 8.49662, 8.38324, 9.23408, 8.48789, 8.85562, 8.49915, 8.56674}
******************************* Delta 13 C (o/oo) *******************************
{-26.06594, -25.03469, -26.33867, -26.18466, -26.18444, -26.20372, -26.79093, -26.57585, -26.2253, -26.39677, -25.95399, -25.62618, -25.89677, -25.8306, -25.33302, -24.65859, -25.69199, -24.45721, -25.39587, -26.83006, -24.80526, -26.06943, -26.75621, -24.69638, -24.43062, -25.38157, -26.57563, -25.32829, -25.27385, -25.54456, -26.7489, -26.05621, -25.48025, -25.79203, -26.08165, nan, -25.77951, -26.65931, -26.07021, -26.06967, -26.22664, -26.08547, -26.68867, -25.22664, -26.86127, nan, -26.20538, -26.03679, -26.78733, -26.3433, -25.29805, -26.01363, -26.32909, -26.59467, -26.65359, -26.22227, -25.5139, -25.09368, -26.79846, -25.73722, -26.7699, -24.61867, -24.805, -24.29229, -24.60882, -24.36088, -24.59467, -24.31198, -24.40753, -24.62717, -24.70615, nan, -26.11657, -25.85203, -26.41218, -25.77264, nan, -25.95541, nan, -26.23886, -25.80208, -26.18599, nan, -25.88547, -26.05756, -26.35425, -26.15531, -26.11244, nan, -26.76821, -26.03495, -26.06209, -26.63085, -26.38085, -25.81513, -25.3933, -25.4276, -25.81012, -25.49448, -24.90816, -24.52896, -25.19837, -25.11223, -24.52698, nan, -24.59996, -24.6844, -24.55644, -24.45195, -26.21019, nan, -26.84374, -26.13971, -26.27853, -26.70968, -25.32176, -26.42406, -26.57941, -25.54976, -25.6821, -24.4228, -26.44787, -25.14591, -26.1165, -26.38396, -25.97696, -26.19444, -25.52473, -26.84506, -25.01185, -26.01549, -26.84272, -26.5929, -26.84415, -26.37809, -26.30037, -25.21799, -25.57956, -26.30019, -25.37899, -25.37746, -24.26375, -24.59066, -24.6879, -24.34684, -24.25255, -24.89958, -26.24369, -25.06691, -26.63405, -26.13832, -25.0602, -25.46172, -25.39369, -26.09635, -25.36288, -24.53494, -26.2766, -25.79529, -26.17213, -26.60436, -24.66867, -24.48153, -25.89741, -26.06203, -25.89834, -24.10255, -26.18161, -26.22848, -26.09989, -26.61601, -26.55602, -26.46254, -26.0245, -25.22588, -26.79053, -25.50811, -26.55351, -26.23027, -24.69454, -24.7557, -24.49433, -24.41562, -25.04169, -24.48403, -24.64162, -24.54903, -26.60023, 
-26.18763, -25.40075, -26.07081, -26.69166, -25.23453, -25.46782, -25.46327, nan, -25.42621, nan, -25.07683, -26.89644, -26.11046, -25.11609, -26.15775, -26.12417, -26.11199, -26.86352, -26.67799, -25.39181, -25.45171, -26.61414, -25.83352, -25.38017, -24.57994, -24.56481, -24.73735, -25.50562, -25.42826, -24.66259, -26.51382, -24.23592, -23.78767, -24.84059, -23.90309, -25.49523, -25.86482, -26.24837, -25.48383, -25.76147, -26.02002, -25.88156, -26.86485, -25.46569, -26.04726, -26.72751, -25.21315, -26.28055, -25.13993, -25.0102, -25.03474, -25.29856, -25.23061, -26.60484, -26.21569, -26.32601, -26.66958, -26.54718, -25.79189, -25.4168, -26.45978, -25.18543, -25.01745, -24.404, -25.61039, -25.00169, -24.64335, -24.97134, -24.47142, -24.66188, -24.31912, -24.86594, -24.78984, -26.07821, -26.01152, -25.53768, -25.69195, -25.09383, -26.16524, -26.3146, -26.40943, nan, -26.84166, -26.74809, -25.23319, -26.61075, -26.44815, -25.12255, -26.38092, -25.96013, -26.9547, -26.27573, -24.77227, -26.42018, -26.07188, -25.52627, -26.04117, -26.21651, -26.74249, -25.19017, -26.69543, nan, -26.50086, -26.11969, -25.32426, -25.69327, -26.38986, -25.0639, -26.78958, -26.68311, -26.7765, -26.1396, -26.09294, -25.1455, -25.60826, -25.57647, -25.88798, -25.79549, -24.7294, -24.35575, -24.65786, -24.38933, -25.03492, -24.30229, -24.17282, -24.68741, -24.36202, -24.3613, -26.15003, -26.49288, -26.61788, -26.71199, -26.79358, -24.16566, -24.59897, -26.23613, -25.98843, -24.38751, -26.12989, -27.01854, -26.5387, -26.36678, -24.54704, -24.59513, -26.72791, -26.42563, -25.70711, -26.03442, -25.3947, -26.8154, -24.45189, -24.90024, -23.89017, -26.48973, -26.36863, -26.70783}
[109]:
# Coerce flipper length from object to numeric dtype (it was read in as
# strings; non-numeric entries would raise — TODO confirm none exist).
df['Flipper Length (mm)'] = pd.to_numeric(df['Flipper Length (mm)'])
[110]:
# Same coercion for body mass: object -> numeric dtype.
df['Body Mass (g)'] = pd.to_numeric(df['Body Mass (g)'])
[111]:
# Print the distinct values of every column, framed by a banner,
# to eyeball cardinality, stray categories and NaNs per column.
banner = "*******************************"
for col in df.columns:
    print(banner, col, banner)
    print()
    print(set(df[col].to_list()))
******************************* Species *******************************
{'Gentoo penguin (Pygoscelis papua)', 'Chinstrap penguin (Pygoscelis antarctica)', 'Adelie Penguin (Pygoscelis adeliae)'}
******************************* Region *******************************
{'Anvers'}
******************************* Island *******************************
{'Torgersen', 'Biscoe', 'Dream'}
******************************* Stage *******************************
{'Adult, 1 Egg Stage'}
******************************* Clutch Completion *******************************
{'No', 'Yes'}
******************************* Culmen Length (mm) *******************************
{49.4, 49.9, nan, 45.7, 50.4, 32.1, 33.1, 33.5, 35.6, 36.2, 35.7, 38.1, 35.2, 40.2, 41.5, 34.1, 34.6, 36.7, 37.8, 38.7, 38.9, 39.5, 40.3, 42.0, 42.5, 46.0, 44.1, 45.8, 46.5, 50.0, 51.3, 52.7, 51.7, 52.0, 58.0, 54.2, 55.8, 59.6, 47.7, 47.2, 44.0, 48.2, 45.0, nan, 50.2, 43.8, 34.4, 34.5, 34.0, 35.9, 35.3, 35.0, 35.1, 35.5, 36.6, 36.4, 36.5, 36.0, 36.9, 37.7, 37.9, 37.2, 37.6, 37.0, 38.6, 38.2, 38.8, 37.5, 37.3, 39.1, 39.3, 39.2, 39.8, 39.6, 39.0, 39.7, 40.6, 40.5, 40.9, 41.1, 40.8, 40.1, 41.4, 41.3, 41.6, 41.8, 41.0, 42.2, 42.3, 42.8, 42.1, 42.9, 43.2, 42.7, 43.1, 43.5, 43.3, 44.5, 44.4, 45.6, 45.4, 45.2, 45.9, 45.5, 46.1, 46.6, 46.4, 46.7, 46.9, 47.0, 47.5, 47.6, 46.2, 46.8, 48.5, 48.1, 48.7, 48.4, 47.8, 49.2, 49.5, 49.7, 49.0, 49.8, 50.5, 50.3, 50.6, 50.9, 50.8, 51.0, 50.1, 51.5, 51.4, 50.7, 52.8, 52.2, 51.9, 47.3, 51.1, 53.5, 52.5, 52.1, 43.6, 53.4, 54.3, 55.9, 55.1, 48.8, 42.6, 49.3, 45.1, 45.3, 42.4, 43.4, 44.9, 48.6, 40.7, 49.6, 49.1, 41.7, 38.5, 46.3, 47.4, 38.3, 36.3, 36.8}
******************************* Culmen Depth (mm) *******************************
{13.2, 14.1, 15.9, 16.7, 17.4, 18.4, 19.0, 20.7, 21.5, 18.7, 19.3, 18.0, 20.6, 17.8, 19.6, 20.2, 21.2, 21.1, 13.5, 14.5, 14.0, 15.5, 15.0, 16.0, 16.4, 16.9, 16.6, 16.1, 17.5, 17.1, 17.6, 17.9, 17.0, 18.1, 18.9, 18.6, 18.5, 19.1, 19.5, 19.4, 19.9, 20.0, 20.5, 20.1, nan, 13.4, 13.9, 14.4, 14.9, 15.4, 13.3, 13.8, 13.7, 14.3, 14.8, 14.2, 14.7, 15.2, 15.3, 15.7, 15.8, 16.2, 16.8, 16.3, 17.7, 17.3, 17.2, 18.8, 18.3, 18.2, 19.8, 19.2, 19.7, 20.3, 20.8, 16.5, 13.6, 13.1, 14.6, 15.1, 15.6, nan}
******************************* Flipper Length (mm) *******************************
{172.0, 174.0, 176.0, 178.0, 179.0, 180.0, 181.0, 182.0, 183.0, 184.0, 185.0, 186.0, 187.0, 188.0, 189.0, 190.0, 191.0, 192.0, 193.0, 194.0, 195.0, 196.0, 197.0, 198.0, 199.0, 200.0, nan, 202.0, 203.0, 201.0, 205.0, 206.0, 207.0, 208.0, 209.0, 210.0, 211.0, 212.0, 213.0, 214.0, 215.0, 216.0, 217.0, 218.0, 219.0, 220.0, 221.0, 222.0, 223.0, 224.0, 225.0, 226.0, 228.0, 229.0, 230.0, 231.0, nan}
******************************* Body Mass (g) *******************************
{3075.0, 4100.0, nan, nan, 3600.0, 4625.0, 5650.0, 3100.0, 5150.0, 3625.0, 4650.0, 4150.0, 3650.0, 4675.0, 5700.0, 3150.0, 5200.0, 3675.0, 4700.0, 3175.0, 4200.0, 3700.0, 4725.0, 5750.0, 3200.0, 5250.0, 2700.0, 3725.0, 4750.0, 4250.0, 6300.0, 3750.0, 4775.0, 5800.0, 3250.0, 4275.0, 5300.0, 3775.0, 4800.0, 3275.0, 4300.0, 3800.0, 5850.0, 3300.0, 5350.0, 3825.0, 4850.0, 3325.0, 4350.0, 3850.0, 4875.0, 3350.0, 4375.0, 5400.0, 2850.0, 3875.0, 4900.0, 4400.0, 3900.0, 4925.0, 5950.0, 3400.0, 5450.0, 2900.0, 4950.0, 3425.0, 4450.0, 2925.0, 3950.0, 4975.0, 6000.0, 3450.0, 4475.0, 5500.0, 3975.0, 5000.0, 3475.0, 4500.0, 2975.0, 4000.0, 6050.0, 3500.0, 5550.0, 3000.0, 5050.0, 3525.0, 4550.0, 4050.0, 3550.0, 4575.0, 5600.0, 3050.0, 4075.0, 5100.0, 3575.0, 4600.0}
******************************* Sex *******************************
{'FEMALE', 'MALE', nan}
******************************* Delta 15 N (o/oo) *******************************
{7.88494, 8.93465, 9.98044, 8.46894, 8.90027, 8.47781, 8.19539, 8.67538, 8.94956, 8.36821, 8.76651, 9.18718, 10.02019, 8.94332, 8.8964, 8.63243, 8.35396, 8.01979, 9.4606, 9.27158, 9.07826, 8.79665, 8.9846, 9.46929, 9.62357, 9.65061, 8.70642, 8.2066, 8.15426, 8.29671, 8.30515, 8.14756, 8.38404, 8.33524, 8.32718, 9.42666, 9.49283, 9.22537, 9.45827, 9.29808, 8.39299, 7.90436, 8.65914, 8.30231, 7.99184, 8.75984, 8.65015, 8.51951, 8.08138, 8.74802, 9.25769, 9.22286, 9.21292, 8.87988, 9.14863, 10.02372, 9.11066, 9.07825, 8.63551, 8.27141, 8.96472, 8.96436, 8.24651, nan, 8.65803, 8.63259, 8.58063, 8.27376, nan, 8.01485, 8.2345, 8.23468, 8.86495, 8.83352, 8.27102, 7.76843, 7.69778, 7.993, 8.24694, 8.24515, 8.88098, 8.64701, 8.58487, 8.14567, 8.9399, 9.07878, 8.37615, 8.27595, nan, 8.29226, 8.12311, 8.41017, 8.74647, nan, 8.93997, 7.8962, 8.97025, 8.62623, 8.86853, 8.73762, 8.18658, 9.25177, 9.46819, 8.40327, 8.60447, 9.02657, 9.04296, 9.415, 8.45738, 7.79958, 8.24691, 8.59147, nan, 8.30166, nan, 7.8208, 7.96935, 8.81668, 8.68747, 8.47257, 8.49854, 8.47938, 9.56534, 9.00642, 9.32277, 9.59462, 9.05674, 9.36799, 9.06829, 8.04111, 7.90971, 7.96621, 7.89744, 8.03659, 8.5964, 8.79787, 8.79581, 8.04787, 9.22033, 9.15308, 9.05736, 9.43684, 8.48367, 9.32169, 8.36701, 8.62264, 8.25818, 8.45167, 8.36936, 8.85664, 8.41151, 7.63884, 8.13643, 8.19579, 8.24253, 8.19749, 8.52566, 8.47829, 9.43146, 9.18021, 9.08458, 9.36493, 9.39305, 9.43782, 8.58319, 8.3639, 8.85699, 9.2151, 8.39459, 8.94365, 7.92358, 7.97408, 8.56192, 8.77322, 8.08354, 7.9953, 7.84057, 8.55583, 8.43423, 8.57087, 8.97533, 8.83502, 9.41131, 9.23196, 9.01079, 9.32105, 8.1631, 9.2881, 9.59245, 9.26715, 9.70465, 8.15566, 8.16582, 8.20106, 8.33825, 8.48204, 8.717, 8.90002, 8.43951, 8.47067, 9.93727, 9.68933, 9.51929, 9.29078, 9.03935, 9.61734, 8.4207, 8.12691, 8.24246, 7.77672, 7.6322, 7.6887, nan, 8.90723, 8.98705, 8.35802, 8.41837, 8.80967, 8.10417, 9.19031, 9.37369, 9.49645, 9.63954, 9.69756, 9.4618, 9.74144, 8.19101, nan, 
8.95063, nan, 8.39867, 8.56708, 8.57199, 8.26548, 7.83733, 7.63452, 7.82381, 7.8881, 8.30817, 8.2554, 8.6287, 8.68744, 8.20042, 8.55868, 9.53262, 9.6895, 9.46985, 8.63604, 8.47827, 9.72764, 9.66523, 8.84451, 9.8059, 9.77528, 8.48095, 8.3118, 8.71078, 8.10231, 8.46531, 8.77018, nan, nan, 8.32359, 8.66496, 8.14705, 8.47173, 8.65466, 8.92069, 9.13362, 9.35416, 9.74492, 9.18985, 9.51784, 9.11616, nan, nan, 9.36392, 9.75486, 8.2993, 8.51362, 7.88863, 7.68528, nan, 8.27428, 8.61651, 8.14776, 8.80186, 8.28601, 9.10702, 9.88809, 9.80589, 9.18528, 9.50772, 9.49106, 9.31735, 9.0233, 9.35138, 8.64931, 8.78557, 8.46616, 8.21634, 8.35078, 8.10277, 7.96491, 8.3554, 8.63701, 8.13746, 8.22673, 8.38289, 9.11006, 9.28153, 9.79532, 9.12277, 8.66271, 9.17847, 9.108, 10.02544, 9.63074, 9.14382, 8.91434, 8.60092, 8.03624, 8.72037, 8.50153, 8.88942, 8.53018, 8.11238, 9.37608, 9.34089, 9.30722, 8.07137, 8.95998, 9.36668, 9.2372, 9.04218, 9.02642, 8.63488, 8.49662, 8.38324, 9.23408, 8.48789, 8.85562, 8.49915, 8.56674}
******************************* Delta 13 C (o/oo) *******************************
{-26.06594, -25.03469, -26.33867, -26.18466, -26.18444, -26.20372, -26.79093, -26.57585, -26.2253, -26.39677, -25.95399, -25.62618, -25.89677, -25.8306, -25.33302, -24.65859, -25.69199, -24.45721, -25.39587, -26.83006, -24.80526, -26.06943, -26.75621, -24.69638, -24.43062, nan, -25.38157, -25.42826, -26.57563, -25.32829, -25.27385, -25.54456, -26.7489, -26.05621, -25.48025, -25.79203, -26.08165, -25.77951, -26.65931, -26.07021, -26.06967, -26.22664, -26.08547, nan, -26.68867, -25.22664, -26.86127, -26.20538, -26.78733, -26.03679, -26.3433, -25.29805, -26.01363, -26.32909, -26.59467, -26.65359, -26.22227, -25.23319, -25.5139, -25.09368, -26.79846, -25.73722, -24.61867, -24.805, -24.29229, -24.60882, -24.36088, -24.59467, -24.31198, -24.40753, -24.62717, -24.70615, -26.11657, -25.85203, nan, -26.41218, -25.77264, -25.95541, nan, -26.23886, nan, -25.80208, -26.18599, nan, -25.88547, nan, nan, nan, -26.05756, nan, -26.35425, -26.15531, -26.11244, -26.76821, -26.03495, -26.06209, -26.63085, -26.38085, -25.81513, -25.3933, -25.4276, -25.49448, -25.19837, -24.90816, -24.52896, -24.52698, -25.11223, -25.0602, -24.59996, -24.6844, -24.55644, -24.45195, -26.21019, nan, -26.84374, -26.13971, -26.27853, -26.70968, -25.32176, -26.42406, -26.57941, -25.54976, -25.6821, -24.4228, -26.44787, -25.14591, -26.1165, -26.38396, -25.97696, -26.19444, -25.52473, -26.84506, -25.01185, -26.7699, -26.01549, -26.84272, -26.5929, -26.84415, -26.37809, -26.30037, -25.21799, -25.57956, -26.30019, -25.37899, -25.37746, -24.26375, -24.59066, -24.6879, -24.34684, -24.25255, -24.89958, -26.24369, -25.06691, -26.63405, -26.13832, -25.46172, -25.39369, -26.09635, -24.53494, -26.2766, -25.79529, -26.17213, -26.60436, -24.66867, -24.48153, -25.89741, -24.78984, -26.06203, -25.89834, -24.10255, -26.18161, -26.22848, -26.09989, -26.61601, -26.55602, -26.46254, -26.0245, -26.79053, -25.50811, -26.55351, -25.36288, -26.23027, -24.69454, -24.7557, -24.49433, -24.41562, -25.04169, -24.48403, -24.64162, 
-24.54903, -26.60023, -26.18763, -25.40075, -26.07081, -26.69166, -25.23453, -25.46782, -25.46327, nan, -25.42621, -25.07683, -26.89644, -26.11046, -25.81012, nan, -25.11609, -26.15775, -26.12417, -26.11199, -26.86352, -26.67799, -25.39181, -25.45171, -26.61414, -25.83352, -25.38017, -24.23592, -24.57994, -24.56481, -24.73735, -25.50562, -24.66259, -26.51382, -23.90309, -23.78767, -24.84059, -25.49523, -25.86482, -26.24837, -25.48383, -25.76147, -26.02002, -25.88156, -26.86485, -25.46569, -26.04726, -26.72751, -25.21315, -26.28055, -25.13993, -25.03474, -25.29856, -25.23061, -25.22588, -26.60484, -26.21569, -26.32601, -26.66958, -26.54718, -25.79189, -25.4168, -26.45978, -25.0102, -25.18543, -24.404, -25.01745, -25.61039, -25.00169, -24.64335, -24.97134, -24.47142, -24.66188, -24.31912, -24.86594, -26.07821, -26.01152, -25.53768, -25.69195, -25.09383, -26.16524, -26.3146, -26.40943, -26.84166, -26.74809, -26.61075, -26.44815, -25.12255, -26.38092, -25.96013, -26.9547, -26.27573, -24.77227, -26.42018, -26.07188, -25.52627, -26.04117, -26.21651, -26.74249, -25.19017, -26.69543, -26.50086, -26.11969, -25.32426, -25.69327, -26.38986, -25.0639, -26.78958, -26.68311, -26.7765, -26.1396, -26.09294, -25.1455, -25.60826, -25.57647, -25.88798, -25.79549, -24.7294, -24.35575, -24.65786, -24.38933, -25.03492, -24.30229, -24.17282, -24.68741, -24.36202, -24.3613, -26.15003, -26.49288, -26.61788, -26.71199, -26.79358, -24.16566, -24.59897, -26.23613, -25.98843, -24.38751, -26.12989, -27.01854, -26.5387, -26.36678, -24.54704, -24.59513, -26.72791, -26.42563, -25.70711, -26.03442, -25.3947, -26.8154, -24.45189, -24.90024, -23.89017, -26.48973, -26.36863, -26.70783}
[112]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 344 entries, 0 to 343 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Species 344 non-null object 1 Region 344 non-null object 2 Island 344 non-null object 3 Stage 344 non-null object 4 Clutch Completion 344 non-null object 5 Culmen Length (mm) 342 non-null float64 6 Culmen Depth (mm) 342 non-null float64 7 Flipper Length (mm) 342 non-null float64 8 Body Mass (g) 342 non-null float64 9 Sex 334 non-null object 10 Delta 15 N (o/oo) 330 non-null float64 11 Delta 13 C (o/oo) 331 non-null float64 dtypes: float64(6), object(6) memory usage: 32.4+ KB
[113]:
df.isnull().sum()
[113]:
Species 0 Region 0 Island 0 Stage 0 Clutch Completion 0 Culmen Length (mm) 2 Culmen Depth (mm) 2 Flipper Length (mm) 2 Body Mass (g) 2 Sex 10 Delta 15 N (o/oo) 14 Delta 13 C (o/oo) 13 dtype: int64
[114]:
df.columns
[114]:
Index(['Species', 'Region', 'Island', 'Stage', 'Clutch Completion',
'Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)',
'Body Mass (g)', 'Sex', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)'],
dtype='object')[115]:
#Numerical column --> 'Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)','Body Mass (g)', 'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)'
#Char Variable --> 'Sex',
[116]:
df['Sex'].value_counts()
[116]:
Sex MALE 169 FEMALE 165 Name: count, dtype: int64
[117]:
# Fill the 10 missing Sex values with 'MALE' (majority class: 169 vs 165).
# NOTE(review): using the mode programmatically (e.g. SimpleImputer with
# strategy='most_frequent') would be more robust — TODO confirm intent.
df['Sex'] = df['Sex'].fillna('MALE')
[118]:
# Column groups used by the imputation and encoding steps below.
# Median imputation is chosen because the median is not influenced by
# outliers; imputing all 6 numeric columns in one go beats one-by-one.
# NOTE(review): the x/y split is commented out here and done later, so
# imputation happens on the full frame BEFORE the train/test split.
# x = df.iloc[:,1:]
# y= df[['Species']]
numerical = ['Culmen Length (mm)','Culmen Depth (mm)','Flipper Length (mm)','Body Mass (g)','Delta 15 N (o/oo)','Delta 13 C (o/oo)']
categorical = ['Species', 'Region', 'Island', 'Stage', 'Clutch Completion','Sex']
[119]:
# BUG FIX: `y` was only assigned in a commented-out line above, so a fresh
# Restart-&-Run-All would raise NameError here.  Define the target frame
# explicitly before inspecting it (Species is the dependent variable).
y = df[['Species']]
y.head()
[119]:
| Species | |
|---|---|
| 0 | Adelie Penguin (Pygoscelis adeliae) |
| 1 | Adelie Penguin (Pygoscelis adeliae) |
| 2 | Adelie Penguin (Pygoscelis adeliae) |
| 3 | Adelie Penguin (Pygoscelis adeliae) |
| 4 | Adelie Penguin (Pygoscelis adeliae) |
[120]:
Selection deleted
# Class balance of the target: Adelie 152, Gentoo 124, Chinstrap 68 — imbalanced.
df['Species'].value_counts()
[120]:
Species Adelie Penguin (Pygoscelis adeliae) 152 Gentoo penguin (Pygoscelis papua) 124 Chinstrap penguin (Pygoscelis antarctica) 68 Name: count, dtype: int64
[121]:
# Superseded by the SimpleImputer cell below — kept for reference only.
# df['Culmen Length (mm)'] = df['Culmen Length (mm)'].fillna(df['Culmen Length (mm)'].median())
# df['Culmen Depth (mm)'] = df['Culmen Depth (mm)'].fillna(df['Culmen Depth (mm)'].median())
# df['Flipper Length (mm)'] = df['Flipper Length (mm)'].fillna(df['Flipper Length (mm)'].median())
# df['Body Mass (g)'] = df['Body Mass (g)'].fillna(df['Body Mass (g)'].median())
# df['Delta 15 N (o/oo)'] = df['Delta 15 N (o/oo)'].fillna(df['Delta 15 N (o/oo)'].median())
# df['Delta 13 C (o/oo)'] = df['Delta 13 C (o/oo)'].fillna(df['Delta 13 C (o/oo)'].median())
[122]:
Selection deleted
from sklearn.impute import SimpleImputer

# Median-impute all six numeric columns in a single pass
# (median is robust to the outliers seen in the value dumps above).
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df[numerical] = median_imputer.fit_transform(df[numerical])
"""
Char imputation by mode concept
impute = SimpleImputer(missing_values = np.nan , strategy='most_frequent')
df[categorical] = impute.fit_transform(df[categorical])
"""
[122]:
"\nChar imputation by mode concept\nimpute = SimpleImputer(missing_values = np.nan , strategy='most_frequent')\ndf[categorical] = impute.fit_transform(df[categorical])\n"
[123]:
Selection deleted
# Verify that no missing values remain after imputation (all counts should be 0).
df.isnull().sum()
[123]:
Species 0 Region 0 Island 0 Stage 0 Clutch Completion 0 Culmen Length (mm) 0 Culmen Depth (mm) 0 Flipper Length (mm) 0 Body Mass (g) 0 Sex 0 Delta 15 N (o/oo) 0 Delta 13 C (o/oo) 0 dtype: int64
[124]:
#Preprocessing part 2
# Peek at the frame after missing-value handling, before encoding.
df.head()
[124]:
| Species | Region | Island | Stage | Clutch Completion | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | Yes | 39.10 | 18.7 | 181.0 | 3800.0 | MALE | 8.652405 | -25.83352 |
| 1 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | Yes | 39.50 | 17.4 | 186.0 | 3800.0 | FEMALE | 8.949560 | -24.69454 |
| 2 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | Yes | 40.30 | 18.0 | 190.0 | 3250.0 | FEMALE | 8.368210 | -25.33302 |
| 3 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | Yes | 44.45 | 17.3 | 197.0 | 4050.0 | MALE | 8.652405 | -25.83352 |
| 4 | Adelie Penguin (Pygoscelis adeliae) | Anvers | Torgersen | Adult, 1 Egg Stage | Yes | 36.70 | 19.3 | 193.0 | 3450.0 | FEMALE | 8.766510 | -25.32426 |
# Encoding
Encoding¶
[125]:
# Tree-based models (Decision Tree): label encoding is sufficient, since
# splits are threshold-based and not biased by the integer ordering.
# Linear models (e.g. Linear Regression): prefer one-hot encoding instead.
[126]:
# df['Species'] =
[127]:
Selection deleted
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column.  A LabelEncoder is refitted per
# column inside the loop — exactly what the repeated fit_transform calls did.
encoder = LabelEncoder()
for col in ['Species', 'Region', 'Island', 'Stage', 'Clutch Completion', 'Sex']:
    df[col] = encoder.fit_transform(df[col])
[128]:
# All categorical columns are now integer codes; numeric columns unchanged.
df.head()
[128]:
| Species | Region | Island | Stage | Clutch Completion | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 2 | 0 | 1 | 39.10 | 18.7 | 181.0 | 3800.0 | 1 | 8.652405 | -25.83352 |
| 1 | 0 | 0 | 2 | 0 | 1 | 39.50 | 17.4 | 186.0 | 3800.0 | 0 | 8.949560 | -24.69454 |
| 2 | 0 | 0 | 2 | 0 | 1 | 40.30 | 18.0 | 190.0 | 3250.0 | 0 | 8.368210 | -25.33302 |
| 3 | 0 | 0 | 2 | 0 | 1 | 44.45 | 17.3 | 197.0 | 4050.0 | 1 | 8.652405 | -25.83352 |
| 4 | 0 | 0 | 2 | 0 | 1 | 36.70 | 19.3 | 193.0 | 3450.0 | 0 | 8.766510 | -25.32426 |
[129]:
# Confirm dtypes: 6 int32 (encoded categoricals) + 6 float64 columns, no nulls.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 344 entries, 0 to 343 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Species 344 non-null int32 1 Region 344 non-null int32 2 Island 344 non-null int32 3 Stage 344 non-null int32 4 Clutch Completion 344 non-null int32 5 Culmen Length (mm) 344 non-null float64 6 Culmen Depth (mm) 344 non-null float64 7 Flipper Length (mm) 344 non-null float64 8 Body Mass (g) 344 non-null float64 9 Sex 344 non-null int32 10 Delta 15 N (o/oo) 344 non-null float64 11 Delta 13 C (o/oo) 344 non-null float64 dtypes: float64(6), int32(6) memory usage: 24.3 KB
# Outlier
Outlier¶
[130]:
# Outlier handling is only required if the tree grows large (overfitting risk);
# the feature importances below can also guide which variables need attention.
[131]:
# Summary statistics to eyeball value ranges and potential outliers.
df.describe()
[131]:
| Species | Region | Island | Stage | Clutch Completion | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 344.000000 | 344.0 | 344.000000 | 344.0 | 344.000000 | 344.000000 | 344.000000 | 344.000000 | 344.000000 | 344.000000 | 344.000000 | 344.000000 |
| mean | 0.918605 | 0.0 | 0.662791 | 0.0 | 0.895349 | 43.925000 | 17.152035 | 200.877907 | 4201.017442 | 0.520349 | 8.730086 | -25.691855 |
| std | 0.893320 | 0.0 | 0.726194 | 0.0 | 0.306549 | 5.443792 | 1.969060 | 14.032539 | 799.618885 | 0.500313 | 0.540630 | 0.779277 |
| min | 0.000000 | 0.0 | 0.000000 | 0.0 | 0.000000 | 32.100000 | 13.100000 | 172.000000 | 2700.000000 | 0.000000 | 7.632200 | -27.018540 |
| 25% | 0.000000 | 0.0 | 0.000000 | 0.0 | 1.000000 | 39.275000 | 15.600000 | 190.000000 | 3550.000000 | 0.000000 | 8.307415 | -26.285460 |
| 50% | 1.000000 | 0.0 | 1.000000 | 0.0 | 1.000000 | 44.450000 | 17.300000 | 197.000000 | 4050.000000 | 1.000000 | 8.652405 | -25.833520 |
| 75% | 2.000000 | 0.0 | 1.000000 | 0.0 | 1.000000 | 48.500000 | 18.700000 | 213.000000 | 4750.000000 | 1.000000 | 9.136170 | -25.089467 |
| max | 2.000000 | 0.0 | 2.000000 | 0.0 | 1.000000 | 59.600000 | 21.500000 | 231.000000 | 6300.000000 | 1.000000 | 10.025440 | -23.787670 |
# Feature Scaling
#because I am not doing any differentiation in DT
Feature Scaling¶
#because I am not doing any differentiation in DT
[132]:
# Interactive EDA with D-Tale (third-party dependency; spins up a local web UI).
# NOTE(review): not needed for the model itself — safe to skip offline.
import dtale
dtale.show(df)
[132]:
# Imbalance Treatment
Imbalance Treatment¶
[137]:
# Culmen length vs depth, colored by species.
sns.scatterplot(x='Culmen Length (mm)',y='Culmen Depth (mm)',data=df,hue='Species')
[137]:
<Axes: xlabel='Culmen Length (mm)', ylabel='Culmen Depth (mm)'>
#
#
[138]:
# Pairwise feature relationships, colored by species.
sns.pairplot(data=df,hue='Species')
plt.show()
[142]:
# Correlation heatmap of all (now numeric) columns.
plt.figure(figsize=(15,8))
sns.heatmap(df.corr(),annot=True,cmap='coolwarm')
plt.show()
[ ]:
# The heatmap shows a ~0.85 correlation, but decision trees are not
# sensitive to multicollinearity, so this can safely be ignored here.
[143]:
# Box plots of culmen length per species, faceted by sex.
sns.catplot(data=df,x='Species',y='Culmen Length (mm)',kind='box',col='Sex')
2024-09-25 09:23:42,441 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. 2024-09-25 09:23:42,448 - INFO - Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
[143]:
<seaborn.axisgrid.FacetGrid at 0x285db733b90>
[ ]:
# Split the data into independent and dependent variable
[147]:
# Features: every column except Species; target: Species (column 0, now int-coded).
x = df.iloc[:,1:]
y=df.iloc[:,0]
[148]:
# Feature frame without the target column.
x.head()
[148]:
| Region | Island | Stage | Clutch Completion | Culmen Length (mm) | Culmen Depth (mm) | Flipper Length (mm) | Body Mass (g) | Sex | Delta 15 N (o/oo) | Delta 13 C (o/oo) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2 | 0 | 1 | 39.10 | 18.7 | 181.0 | 3800.0 | 1 | 8.652405 | -25.83352 |
| 1 | 0 | 2 | 0 | 1 | 39.50 | 17.4 | 186.0 | 3800.0 | 0 | 8.949560 | -24.69454 |
| 2 | 0 | 2 | 0 | 1 | 40.30 | 18.0 | 190.0 | 3250.0 | 0 | 8.368210 | -25.33302 |
| 3 | 0 | 2 | 0 | 1 | 44.45 | 17.3 | 197.0 | 4050.0 | 1 | 8.652405 | -25.83352 |
| 4 | 0 | 2 | 0 | 1 | 36.70 | 19.3 | 193.0 | 3450.0 | 0 | 8.766510 | -25.32426 |
[149]:
# Target is now the integer-encoded Species column (0/1/2).
y.head()
[149]:
0 0 1 0 2 0 3 0 4 0 Name: Species, dtype: int32
# Imbalance Treatment
Imbalance Treatment¶
[151]:
Selection deleted
import imblearn
from imblearn.over_sampling import RandomOverSampler

# Random oversampling duplicates minority-class rows until every species
# matches the majority count (152 each), fixing the class imbalance.
sampler = RandomOverSampler()
x_over, y_over = sampler.fit_resample(x, y)

print("Imbalance Datapoints:", y.value_counts())
print()
print("Balance Datapoints:", y_over.value_counts())
Imbalance Datapoints: Species 0 152 2 124 1 68 Name: count, dtype: int64 Balance Datapoints: Species 0 152 1 152 2 152 Name: count, dtype: int64
[152]:
# Stratified 75/25 split of the oversampled data keeps all three classes
# balanced in both folds; random_state=1 pins the shuffle for reproducibility.
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_over,y_over,random_state=1,test_size=0.25,stratify=y_over)
# Building Decision Tree Model
Building Decision Tree Model¶
[166]:
Selection deleted
from sklearn.tree import DecisionTreeClassifier
# Approach 1 - Gini impurity (sklearn's default criterion)
dtree1 = DecisionTreeClassifier(criterion='gini')  # gini is the default anyway
dtree1.fit(x_train,y_train)
# Approach 2 - Entropy (information gain)
dtree2 = DecisionTreeClassifier(criterion='entropy')
dtree2.fit(x_train,y_train)
[166]:
DecisionTreeClassifier(criterion='entropy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(criterion='entropy')
# Predict the test data by using both the models
Predict the test data by using both the models¶
[167]:
# Predict both folds with each tree so train/test accuracy can be compared.
y_pred_dt1_train = dtree1.predict(x_train)
y_pred_dt1_test = dtree1.predict(x_test)
y_pred_dt2_train = dtree2.predict(x_train)
y_pred_dt2_test = dtree2.predict(x_test)
# Evaluate Your Model
Evaluate Your Model¶
[168]:
from sklearn.metrics import classification_report ,confusion_matrix, accuracy_score
[169]:
Selection deleted
# Gini tree: report on train (perfect fit, as expected for an unpruned tree),
# then on the held-out test fold.
print(classification_report(y_train,y_pred_dt1_train))
print()
print(classification_report(y_test,y_pred_dt1_test))
precision recall f1-score support
0 1.00 1.00 1.00 114
1 1.00 1.00 1.00 114
2 1.00 1.00 1.00 114
accuracy 1.00 342
macro avg 1.00 1.00 1.00 342
weighted avg 1.00 1.00 1.00 342
precision recall f1-score support
0 0.97 0.97 0.97 38
1 1.00 0.97 0.99 38
2 0.95 0.97 0.96 38
accuracy 0.97 114
macro avg 0.97 0.97 0.97 114
weighted avg 0.97 0.97 0.97 114
[170]:
# Entropy tree: train report (perfect fit), then held-out test report.
print(classification_report(y_train,y_pred_dt2_train))
print()
print(classification_report(y_test,y_pred_dt2_test))
precision recall f1-score support
0 1.00 1.00 1.00 114
1 1.00 1.00 1.00 114
2 1.00 1.00 1.00 114
accuracy 1.00 342
macro avg 1.00 1.00 1.00 342
weighted avg 1.00 1.00 1.00 342
precision recall f1-score support
0 1.00 0.95 0.97 38
1 0.97 1.00 0.99 38
2 0.97 1.00 0.99 38
accuracy 0.98 114
macro avg 0.98 0.98 0.98 114
weighted avg 0.98 0.98 0.98 114
[171]:
# Confusion matrices for the Gini tree: train (fully diagonal), then test.
print(confusion_matrix(y_train,y_pred_dt1_train))
print()
print(confusion_matrix(y_test,y_pred_dt1_test))
[[114 0 0] [ 0 114 0] [ 0 0 114]] [[37 0 1] [ 0 37 1] [ 1 0 37]]
[172]:
# Confusion matrices for the entropy tree: train, then test.
print(confusion_matrix(y_train,y_pred_dt2_train))
print()
print(confusion_matrix(y_test,y_pred_dt2_test))
[[114 0 0] [ 0 114 0] [ 0 0 114]] [[36 1 1] [ 0 38 0] [ 0 0 38]]
[176]:
# Gini tree: 1.0 train vs ~0.974 test accuracy — a mild overfit.
print("Train Accuracy - Gini: ",accuracy_score(y_train,y_pred_dt1_train))
print()
print("Test Accuracy - Gini: ",accuracy_score(y_test,y_pred_dt1_test))
Train Accuracy - Gini: 1.0 Test Accuracy - Gini: 0.9736842105263158
[178]:
# Entropy tree: 1.0 train vs ~0.982 test accuracy.
print("Train Accuracy -Entropy :",accuracy_score(y_train,y_pred_dt2_train))
print()
print("Test Accuracy -Entropy :",accuracy_score(y_test,y_pred_dt2_test))
Train Accuracy -Entropy : 1.0 Test Accuracy -Entropy : 0.9824561403508771
# Cross Validation - K_Fold_Method
Cross Validation - K_Fold_Method¶
[179]:
# 10-fold cross-validation of the Gini tree on the training fold.
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(dtree1,x_train,y_train,cv=10)
[180]:
# Per-fold scores, their mean, and the best fold.
print(accuracy)
print()
print(accuracy.mean())
print()
print(accuracy.max())
[0.88571429 1. 1. 1. 0.97058824 0.94117647 1. 0.94117647 0.97058824 1. ] 0.9709243697478991 1.0
[187]:
Selection deleted
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix  # re-import; already in scope from above
# Compute the test-fold confusion matrix for the Gini tree
cm = confusion_matrix(y_test, dtree1.predict(x_test))
# Display the confusion matrix as a labeled plot
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()
# Post Pruning
Post Pruning¶
[189]:
Selection deleted
# Gini importance of each feature (the values sum to 1.0).
dtree1.feature_importances_
[189]:
array([0. , 0.01723807, 0. , 0. , 0.04058196,
0. , 0.46088301, 0.00556215, 0. , 0. ,
0.47573481])[190]:
# Tabulate the importances against feature names for readability.
pd.DataFrame(index=x.columns,data=dtree1.feature_importances_,columns = ["Feature Importance"])
[190]:
| Feature Importance | |
|---|---|
| Region | 0.000000 |
| Island | 0.017238 |
| Stage | 0.000000 |
| Clutch Completion | 0.000000 |
| Culmen Length (mm) | 0.040582 |
| Culmen Depth (mm) | 0.000000 |
| Flipper Length (mm) | 0.460883 |
| Body Mass (g) | 0.005562 |
| Sex | 0.000000 |
| Delta 15 N (o/oo) | 0.000000 |
| Delta 13 C (o/oo) | 0.475735 |
[ ]:
# BUG FIX: the first line below was bare prose inside a code cell and
# would raise a SyntaxError if the cell were ever executed — comment it out.
# Order of feature importance for variables (top two):
#   Delta 13 C (o/oo)    0.475735
#   Flipper Length (mm)  0.460883
[191]:
# Visualize the full (unpruned) Gini tree.
from sklearn.tree import plot_tree
plt.figure(figsize=(12,8))
plot_tree(dtree1)
plt.show()
[193]:
from sklearn.tree import plot_tree  # re-import; already available from the cell above
plt.figure(figsize=(12,8))
plot_tree(dtree1,filled=True,feature_names=x.columns)
plt.show()
# 'value' in each node shows the class counts of the target (Species)
# 'samples' is how many training rows reach that node
# Using hyperparameters in the Decision Tree to apply the pruning method
Using hyperparameters in the Decision Tree to apply the pruning method¶
[201]:
# Pruning via max_depth=5, guided by the non-zero feature importances above.
# NOTE(review): the (misspelled) name 'prunned_tree' is referenced by
# report_model() below, so it is intentionally left unchanged.
prunned_tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
prunned_tree.fit(x_train,y_train)
[201]:
DecisionTreeClassifier(max_depth=5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(max_depth=5)
# User Defined Functions
User Defined Functions¶
[202]:
def report_model(model, X=None, y_true=None):
    """Print a classification report and accuracy for `model`, then plot its tree.

    Parameters
    ----------
    model : fitted sklearn tree estimator (must support predict and plot_tree)
    X : optional feature frame to score; defaults to the notebook-global x_test
    y_true : optional true labels; defaults to the notebook-global y_test

    Calling report_model(model) behaves exactly as before; the new optional
    parameters remove the hard dependency on global state.
    """
    if X is None:
        X = x_test
    if y_true is None:
        y_true = y_test
    model_preds = model.predict(X)
    print(classification_report(y_true, model_preds))
    print("\n")
    print(accuracy_score(y_true, model_preds))
    print("\n")
    plt.figure(figsize=(12, 8), dpi=150)
    # X shares its columns with the global x, so labels match the original
    plot_tree(model, filled=True, feature_names=X.columns)
[203]:
report_model(prunned_tree)
# Pruned tree: macro-avg precision 0.97, accuracy ~0.9649 — slightly below
# the unpruned tree but with a far simpler, depth-limited model.
precision recall f1-score support
0 0.97 0.95 0.96 38
1 1.00 0.97 0.99 38
2 0.93 0.97 0.95 38
accuracy 0.96 114
macro avg 0.97 0.96 0.97 114
weighted avg 0.97 0.96 0.97 114
0.9649122807017544
# Logistic Regression
Logistic Regression¶
[205]:
Selection deleted
# Baseline comparison: logistic regression on the same (oversampled) split.
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
logit.fit(x_train,y_train)
[205]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
[207]:
# Logistic-regression predictions on both folds.
y_pred_train_logit = logit.predict(x_train)
y_pred_test_logit = logit.predict(x_test)
[213]:
# Logistic baseline: ~0.997 train / ~0.991 test accuracy on this split.
print("Train Accuracy - Logit :", accuracy_score(y_train, y_pred_train_logit))
print() # This will print a blank line
print("Test Accuracy - Logit :", accuracy_score(y_test, y_pred_test_logit))
Train Accuracy - Logit : 0.9970760233918129 Test Accuracy - Logit : 0.9912280701754386
[ ]:
# Overfitting situation
[ ]:
# For multiclass targets, tree-based models (Decision Tree, or better a
# Random Forest) are generally preferred over plain logistic regression.
# Bagging Classifier
Bagging Classifier¶
[215]:
# Bagging with the default base estimator (a decision tree).
# NOTE(review): the parameter is `estimator` in recent sklearn versions,
# `base_estimator` in older ones; the default is a DecisionTreeClassifier.
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier() # base_estimator = 'DT' by default
bagging.fit(x_train,y_train)
[215]:
BaggingClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BaggingClassifier()
[217]:
# Score the bagging ensemble on both folds.
# FIX: removed a stray no-op print() left between the two predict calls.
y_pred_bagging_train = bagging.predict(x_train)
y_pred_bagging_test = bagging.predict(x_test)
[218]:
# FIX: the labels previously said "Logit" (copy-paste error) although these
# lines score the Bagging classifier.
print("Train Accuracy - Bagging :", accuracy_score(y_train, y_pred_bagging_train))
print() # blank line between the two scores
print("Test Accuracy - Bagging :", accuracy_score(y_test, y_pred_bagging_test))
Train Accuracy - Logit : 1.0 Test Accuracy - Logit : 0.9736842105263158
# Random Forest Classifier
Random Forest Classifier¶
[225]:
# Random forest: 200 entropy trees with bootstrap sampling; oob_score=True
# adds a free out-of-bag generalization estimate.  Unlike bagging, there is
# no base-estimator concept — the forest is always built from trees.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion='entropy',n_estimators=200,bootstrap=True,oob_score=True) #no concept of base estimator #hyperparameter tuning I am building here
rf.fit(x_train,y_train)
[225]:
RandomForestClassifier(criterion='entropy', n_estimators=200, oob_score=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(criterion='entropy', n_estimators=200, oob_score=True)
[226]:
Selection deleted
# Score the random forest on both folds.
# FIX: removed a stray no-op print() left between the two predict calls.
y_pred_rf_train = rf.predict(x_train)
y_pred_rf_test = rf.predict(x_test)
[227]:
# FIX: the labels previously said "Logit" (copy-paste error) although these
# lines score the Random Forest.
print("Train Accuracy - Random Forest :", accuracy_score(y_train, y_pred_rf_train))
print() # blank line between the two scores
print("Test Accuracy - Random Forest :", accuracy_score(y_test, y_pred_rf_test))
Train Accuracy - Logit : 1.0 Test Accuracy - Logit : 0.9912280701754386
# Applying Cross Validation Method
Applying Cross Validation Method¶
[228]:
Selection deleted
# 10-fold cross-validation of the random forest on the training fold.
from sklearn.model_selection import cross_val_score
Training_Accuracy = cross_val_score(rf,x_train,y_train,cv=10)
print(Training_Accuracy)
[0.97142857 0.97142857 1. 1. 0.97058824 0.97058824 1. 0.97058824 1. 1. ]
[229]:
Selection deleted
# Mean and best fold accuracy across the 10 folds.
print(Training_Accuracy.mean())
print()
print(Training_Accuracy.max())
0.9854621848739497 1.0
[ ]:
